
#include "alloc.h"
#include <string.h>
#include <ctype.h>
#include "html.h"
#define LOG_TAG "HtmlItem"
#include "log.h"

/********************** HtmlItem ****************************/

/*
 * Report whether `str` is one of the element names listed below, i.e. an
 * element whose end tag may be omitted or that has no end tag at all.
 *
 * Each candidate is compared with StringIgnoreCaseEqual, in table order.
 * Returns true on the first match, false when no entry matches.
 */
static bool HtmlItemNoEndLabel(String str) {
    static const char *const noEndNames[] = {
        "base", "link", "meta", "hr", "br", "wbr", "img", "embed",
        "param", "source", "track", "area", "col", "input", "keygen",
        "menuitem"
    };
    const char *const *const stop =
        noEndNames + sizeof(noEndNames) / sizeof(noEndNames[0]);

    for (const char *const *cur = noEndNames; cur != stop; cur++) {
        if (StringIgnoreCaseEqual(str, *cur)) {
            return true;
        }
    }
    return false;
}

/*
 * Look through `str` for an end tag that closes the element called `name`.
 *
 * An occurrence of `name` inside `str` is accepted when the byte in front of
 * it is ' ' or '/' and the byte right behind it is '>'.
 *
 * `str` is modified while scanning: StringMove(str, ...) is applied after
 * every rejected occurrence, so callers should pass a scratch view.
 *
 * Returns true when an accepted occurrence exists, false otherwise
 * (including when `name` is empty).
 */
static bool HtmlItemFindEndLabel(String str, String name) {
    // An empty name cannot move the scan forward (index += 0), so the loop
    // below could spin forever on it; there is nothing meaningful to find.
    if (name->len <= 0) {
        return false;
    }

    int index = StringSubIndex(str, name);
    while (index >= 0) {
        // NOTE(review): for index == 0 this reads the byte just before the
        // view. The caller in this file builds the view right after a "</",
        // so that byte is the '/'; confirm before calling from elsewhere.
        char c = str->ptr[index - 1];
        if (c == ' ' || c == '/') {
            // Only inspect the byte behind the match while it is still
            // inside the view; a match that ends the view has no '>' after it.
            if (index + name->len < str->len && str->ptr[index + name->len] == '>') {
                return true;
            }
        }
        // Rejected: continue searching behind this occurrence.
        index += name->len;
        StringMove(str, index);
        index = StringSubIndex(str, name);
    }
    return false;
}

/*
 * Parse every attribute of the start tag of `xml`, beginning at in->begin.
 *
 * Returns true when the element may have content that still has to be
 * parsed (start tag closed by '>' on a regular element, or input exhausted).
 * Returns false when the item is already complete:
 *   - the tag closed with '>' and its name starts with '/' (a lone end tag),
 *   - the tag closed with '>' and HtmlItemNoEndLabel() accepts its name,
 *   - the tag closed with "/>" (self-closing).
 * In all three cases in->begin is left just behind the closing sequence.
 */
static bool HtmlItemPauseAllAttr(XmlItem xml, Input in)
{
    for (InputSkipSpace(in); in->begin < in->end; InputSkipSpace(in)) {
        if (in->ptr[in->begin] == '>') {
            // End of the start tag: decide whether content can follow.
            in->begin++;
            if (xml->name.ptr[0] == '/' || HtmlItemNoEndLabel(&xml->name)) {
                return false;
            }
            return true;
        }
        if (in->ptr[in->begin] == '/' && in->ptr[in->begin + 1] == '>') {
            // Self-closing tag: no content.
            in->begin += 2;
            return false;
        }
        // Anything else is treated as one attribute.
        XmlItemPauseAttr(xml, in);
    }
    return true;
}

/*
 * Handle an end tag met while parsing the content of `xml`.
 *
 * On entry in->begin is positioned just behind the "</" of the end tag.
 *
 * Returns true when `xml` is finished:
 *   - the end tag names `xml` itself (in->begin is moved behind the tag), or
 *   - the end tag is taken to belong to an ancestor; in->begin is then moved
 *     back by 2 so that it is on the "</" again for the caller's caller.
 * Returns false when the end tag is skipped and content parsing of `xml`
 * should go on (in->begin is moved behind the skipped tag).
 */
static bool HtmlItemPauseEndLabel(XmlItem xml, Input in)
{
    // Name carried by the end tag.
    StringEty label;
    Input2String(in, &label);
    SetXmlStringEnd(&label);

    // The end tag closes this very element.
    if (StringCaseEqual(&label, &xml->name)) {
        in->begin += label.len + 1;
        return true;
    }

    // An element without end tag was wrongly written as an end tag, or we
    // are inside a script element (which must be closed by its own end
    // tag): skip this tag.
    if (HtmlItemNoEndLabel(&label) || StringIgnoreCaseEqual(&xml->name, "script")) {
        in->begin += label.len + 1;
        return false;
    }

    // The end tag belongs to the parent: either a </div> seen inside a p
    // element (p cannot contain block-level elements), or a tag carrying the
    // parent's name. Rewind onto the "</" and finish this element.
    if ((StringIgnoreCaseEqual(&xml->name, "p") && StringIgnoreCaseEqual(&label, "div"))
        || (xml->parent != NULL && StringCaseEqual(&label, &xml->parent->name))) {
        in->begin -= 2;
        return true;
    }

    // Search further down for this element's own end tag; when one exists,
    // the current end tag is discarded.
    const int labelLen = label.len;
    Input2String(in, &label);
    if (HtmlItemFindEndLabel(&label, &xml->name)) {
        in->begin += labelLen + 1;
        return false;
    }

    // No own end tag ahead: this element omitted it and the current end tag
    // belongs to an ancestor.
    LOGW("%s(%d): %.*s label no end", __func__, __LINE__, xml->name.len, xml->name.ptr);
    in->begin -= 2;
    return true;
}

/*
 * Parse one HTML item starting at in->begin (leading whitespace is skipped)
 * and return it as a newly allocated XmlItem whose parent is `parent`.
 * Child elements are parsed recursively and appended to the item's `sub`
 * list.
 *
 * Returns NULL when the input does not start with '<' or when the item
 * cannot be allocated; in both cases in->begin is set to in->end.
 *
 * HTML-specific leniency implemented here:
 *   - content of a script element is never parsed as child elements;
 *   - a p element is ended by a following <div> or <p> start tag;
 *   - a li element is ended by a following <li> start tag;
 *   - a '<' that is followed neither by '/' nor by a letter is plain text.
 */
XmlItem HtmlItemPause(Input in, XmlItem parent)
{
    InputSkipSpace(in);
    // An item has to open with '<'.
    if (in->ptr[in->begin] != '<') {
        LOGE("%s(%d): String don't begin with '<'", __func__, __LINE__);
        in->begin = in->end;
        return NULL;
    }

    XmlItem xml = malloc_ct(XmlItemEty, 1);
    if (xml == NULL) {
        LOGE("Memory exhausted");
        in->begin = in->end;
        return NULL;
    }
    memset(xml, 0, sizeof(XmlItemEty));
    xml->parent = parent;
    StringSetBegin(&xml->range, in);

    // Comments and declarations are delegated to the XML helpers.
    if (in->ptr[in->begin + 1] == '!') {
        XmlItemPauseNote(xml, in);
        StringSetEnd(&xml->range, in);
        return xml;
    } else if (in->ptr[in->begin + 1] == '?') {
        XmlItemPauseState(xml, in);
        StringSetEnd(&xml->range, in);
        return xml;
    }

    // Element name.
    in->begin++;
    Input2String(in, &xml->name);
    SetXmlStringEnd(&xml->name);
    in->begin += xml->name.len;

    // Attributes; a false result means the item is already complete.
    if (!HtmlItemPauseAllAttr(xml, in)) {
        StringSetEnd(&xml->range, in);
        return xml;
    }

    // Element content.
    StringSetBegin(&xml->val, in);
    while (in->begin < in->end) {
        InputSkipSpace(in);
        if (in->ptr[in->begin] != '<') {
            xml->hasFreeVal = true;
            // Look for the '<' that opens the next tag.
            StringEty str;
            Input2String(in, &str);
            int index = StringIndex(&str, '<');
            if (index < 0) {
                // No further tag: the value runs up to the end of the input.
                xml->val.len = (int) (in->ptr + in->end - xml->val.ptr);
                LOGW("%s(%d): %.*s item's name can't find end", __func__, __LINE__,
                    xml->name.len, xml->name.ptr);
                in->begin = in->end;
                return xml;
            }
            // Jump to the start of the next tag.
            in->begin += index;
            continue;
        }

        // An end tag.
        if (in->ptr[in->begin + 1] == '/') {
            StringSetEnd(&xml->val, in);
            in->begin += 2;

            if (!HtmlItemPauseEndLabel(xml, in)) continue;
            StringSetEnd(&xml->range, in);
            return xml;
        } else if (!isalpha((unsigned char) in->ptr[in->begin + 1])) {
            // The cast is required: passing a negative char (any non-ASCII
            // byte) to isalpha() is undefined behaviour.
            // A '<' not followed by a letter is ordinary text.
            xml->hasFreeVal = true;
            in->begin++;
            continue;
        }

        // A start tag.
        if (StringIgnoreCaseEqual(&xml->name, "script")) {
            // script elements have no child elements: skip the '<'.
            in->begin++;
            continue;
        } else if (StringIgnoreCaseEqual(&xml->name, "p")) {
            // p cannot contain block-level elements: check the name of the
            // start tag and end this p when it is a div or another p.
            StringEty str;
            Input2String(in, &str);
            StringMove(&str, 1);
            SetXmlStringEnd(&str);
            if (StringIgnoreCaseEqual(&str, "div") || StringIgnoreCaseEqual(&str, "p")) {
                StringSetEnd(&xml->val, in);
                StringSetEnd(&xml->range, in);
                return xml;
            }
        } else if (StringIgnoreCaseEqual(&xml->name, "li")) {
            // li cannot contain li: check the name of the start tag and end
            // this li when another li starts.
            StringEty str;
            Input2String(in, &str);
            StringMove(&str, 1);
            SetXmlStringEnd(&str);
            if (StringIgnoreCaseEqual(&str, "li")) {
                StringSetEnd(&xml->val, in);
                StringSetEnd(&xml->range, in);
                return xml;
            }
        }
        // Parse the start tag as a child element. The recursive call returns
        // NULL on allocation failure; never store that in the child list.
        XmlItem child = HtmlItemPause(in, xml);
        if (child != NULL) {
            XmlItemListAdd(&xml->sub, child);
        }
    }
    in->begin = in->end;
    return xml;
}

/********************** HtmlDoc ****************************/

/*
 * Fill doc->head and doc->body from the top-level items of `doc`.
 *
 * Top-level items named "head" / "body" are taken directly (the last one of
 * each name wins). Whatever is still missing is looked up inside the
 * top-level "html" item: head via XmlItemSub(html, "head"), body via
 * XmlItemSub(html, "body") and then XmlItemSub(head, "body"). When a lookup
 * finds nothing, the html item itself is used as the fallback.
 *
 * Without a top-level html item nothing more is resolved; an error is
 * logged only if neither head nor body was found.
 */
static void HtmlParseBody(HtmlDoc doc)
{
    XmlItem root = NULL;
    const int total = doc->sub.count;

    for (int k = 0; k < total; k++) {
        XmlItem item = doc->sub.list[k];
        if (StringIgnoreCaseEqual(&item->name, "html")) {
            root = item;
        } else if (StringIgnoreCaseEqual(&item->name, "head")) {
            doc->head = item;
        } else if (StringIgnoreCaseEqual(&item->name, "body")) {
            doc->body = item;
        }
    }

    if (root == NULL) {
        if (doc->head == NULL && doc->body == NULL) {
            LOGE("%s(%d): html label not find", __func__, __LINE__);
        }
        return;
    }

    if (doc->head == NULL) {
        XmlItem head = XmlItemSub(root, StringFrom("head"));
        doc->head = (head != NULL) ? head : root;
    }

    if (doc->body != NULL) {
        return;
    }
    XmlItem body = XmlItemSub(root, StringFrom("body"));
    if (body == NULL) {
        // A body may have ended up nested inside the head item.
        body = XmlItemSub(doc->head, StringFrom("body"));
    }
    if (body == NULL) {
        body = root;
    }
    doc->body = body;
}

/*
 * Parse the whole input `in` into `doc`.
 *
 * `doc` is zeroed first, a BOM is skipped, then top-level items are parsed
 * with HtmlItemPause() until the input is exhausted and collected in
 * doc->sub. Afterwards doc->head / doc->body are resolved, and if the body
 * contains another "body" element the innermost one becomes doc->body.
 */
void HtmlPause(HtmlDoc doc, Input in)
{
    memset(doc, 0, sizeof(*doc));

    InputSkipBOM(in);
    while (in->begin < in->end) {
        // HtmlItemPause() returns NULL (and exhausts the input) when the
        // text does not start with '<' or allocation fails. Such a result
        // must not be stored: HtmlParseBody() dereferences every entry.
        XmlItem item = HtmlItemPause(in, NULL);
        if (item != NULL) {
            XmlItemListAdd(&doc->sub, item);
        }
        InputSkipSpace(in);
    }

    // Resolve the head / body structure of the document.
    HtmlParseBody(doc);
    if (doc->body == NULL) return;

    // Guard against nested body tags: descend into any child named "body".
    XmlItem *lst = doc->body->sub.list;
    int i, subCnt = doc->body->sub.count;
    for (i = 0; i < subCnt; i++) {
        if (!StringIgnoreCaseEqual(&lst[i]->name, "body")) {
            continue;
        }
        LOGW("%s(%d): There has a SubBody", __func__, __LINE__);
        doc->body = lst[i];

        // Restart the scan on the children of the new body; the loop's i++
        // turns -1 into 0.
        i = -1;
        lst = doc->body->sub.list;
        subCnt = doc->body->sub.count;
    }
}

/*
 * Release everything held by `doc`: each top-level item is passed to
 * XmlItemFree(), the item array itself is released with free_ct(), and the
 * document structure is zeroed afterwards.
 */
void HtmlFree(HtmlDoc doc)
{
    int idx = 0;
    while (idx < doc->sub.count) {
        XmlItemFree(doc->sub.list[idx]);
        idx++;
    }
    free_ct(doc->sub.list);
    memset(doc, 0, sizeof(*doc));
}

/*
 * Determine the character set declared in the document head.
 *
 * The direct children of doc->head named "meta" are examined in order:
 *   - a meta with a "charset" attribute: its value is handed to
 *     StringClone(charset, ...) and true is returned;
 *   - otherwise, the first meta whose "http-equiv" (or, failing that,
 *     "https-equiv") attribute equals "Content-Type" is selected and the
 *     charset is taken from its "content" attribute: everything behind
 *     "charset=" up to the first ' ' or ';' (or the end of the value). In
 *     this case `charset` points into the attribute value.
 *
 * Returns false when there is no head, no suitable meta, no "content"
 * attribute, or no "charset=" inside it.
 */
bool HtmlCharset(HtmlDoc doc, String charset)
{
    if (doc->head == NULL) {
        LOGE("%s(%d): head item not find", __func__, __LINE__);
        return false;
    }

    XmlItem contentTypeMeta = NULL;
    XmlItem *children = doc->head->sub.list;
    const int total = doc->head->sub.count;

    for (int k = 0; k < total && contentTypeMeta == NULL; k++) {
        XmlItem item = children[k];
        if (!StringIgnoreCaseEqual(&item->name, "meta")) {
            continue;
        }

        String direct = StringMapGet(&item->attr, StringFrom("charset"));
        if (direct != NULL) {
            StringClone(charset, direct);
            return true;
        }

        String equiv = StringMapGet(&item->attr, StringFrom("http-equiv"));
        if (equiv == NULL) {
            equiv = StringMapGet(&item->attr, StringFrom("https-equiv"));
        }
        if (equiv != NULL && StringIgnoreCaseEqual(equiv, "Content-Type")) {
            contentTypeMeta = item;
        }
    }
    if (contentTypeMeta == NULL) {
        return false;
    }

    String value = StringMapGet(&contentTypeMeta->attr, StringFrom("content"));
    if (value == NULL) {
        LOGE("%s(%d): Content-Type's content attributes not find", __func__, __LINE__);
        return false;
    }
    int start = StringSubIndex(value, StringFrom("charset="));
    if (start < 0) {
        LOGE("%s(%d): charset not find", __func__, __LINE__);
        return false;
    }
    start += 8;  // length of "charset="

    charset->ptr = value->ptr + start;
    const int remain = value->len - start;
    int used = 0;
    while (used < remain && charset->ptr[used] != ' ' && charset->ptr[used] != ';') {
        used++;
    }
    charset->len = used;
    return true;
}

/*
 * Fetch the document title.
 *
 * Looks up "title" among the direct children of doc->head with
 * XmlItemListGet() and, when found, returns the result of
 * XmlItemValue(item, title). Returns false when the document has no head
 * (an error is logged) or the head has no such child.
 */
bool HtmlTitle(HtmlDoc doc, String title)
{
    XmlItem head = doc->head;
    if (head == NULL) {
        LOGE("%s(%d): head item not find", __func__, __LINE__);
        return false;
    }

    XmlItem titleItem = XmlItemListGet(&head->sub, StringFrom("title"));
    if (titleItem != NULL) {
        return XmlItemValue(titleItem, title);
    }
    return false;
}

/*
 * Compare the top-level item lists of `doc` and `doc2`.
 *
 * `lst` is reset to an empty list (NULL array, count 0) and then handed to
 * XmlItemDiff() together with both documents' `sub` lists.
 */
void HtmlDiff(HtmlDoc doc, HtmlDoc doc2, XmlItemList lst)
{
    // Start from an empty result list.
    lst->count = 0;
    lst->list = NULL;

    XmlItemDiff(&doc->sub, &doc2->sub, lst);
}
